R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Preliminary Data Wrangling and Cleaning

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the raw listings table from AirBnB.csv (path is relative to the working directory).
# NOTE(review): read_csv() is a readr function, but only dplyr is attached in the
# visible code -- confirm readr/tidyverse is loaded earlier (e.g. a hidden setup chunk).
df <- read_csv("AirBnB.csv")
## Rows: 7833 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): host_name, host_since_anniversary, Customer Since, neighbourhood_c...
## dbl (22): host_id, host_since_year, Age in years, id, latitude, longitude, a...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove columns by exact name or by regex pattern; every other column is kept
# in its original order.
df <- dplyr::select(
  df,
  -c(host_id, host_name, host_since_year, host_since_anniversary),
  -matches("Customer Since"),
  -c(zipcode, id),
  -c(city, state, country, latitude, longitude),
  -matches("customers...50..review.rate"),
  -number_of_reviews
)

# Give the awkwardly named source columns short snake_case names.
# `city` and `state` are free to reuse here because the original columns with
# those names were dropped by the preceding select().
df <- dplyr::rename(
  df,
  age_of_exp = `Age in years`,
  total_revenue = `Total Rev`,
  rev_per_2_guest = `Daily Rev per 2 guests, unless limited to 1`,
  min_cost_per_night = `Min Nights`,
  neighbourhood = neighbourhood_cleansed,
  city = city_translated,
  state = state_translated
)
# Parse total revenue: strip "$" and "," from the text, convert to numeric, and
# keep only the rows where the conversion succeeded.
df$total_revenue_clean <- suppressWarnings(
  as.numeric(gsub("[$,]", "", as.character(df$total_revenue)))
)
df$total_revenue <- NULL
df <- df[!is.na(df$total_revenue_clean), ]
# log(x + 1) so that a revenue of zero stays finite.
df$log_total_revenue <- log(df$total_revenue_clean + 1)

# Same treatment for revenue per two guests: strip "$" and ",", convert to
# numeric, and drop the rows that do not parse.
df$rev_per_2_guest_clean <- suppressWarnings(
  as.numeric(gsub("[$,]", "", as.character(df$rev_per_2_guest)))
)
df$rev_per_2_guest <- NULL
df <- df[!is.na(df$rev_per_2_guest_clean), ]

# Convert host_response_rate to numeric in place and drop rows whose value is
# missing or not a number.
df$host_response_rate <- suppressWarnings(
  as.numeric(as.character(df$host_response_rate))
)
df <- df[!is.na(df$host_response_rate), ]

# Parse price as a number. Besides "$" and whitespace, also strip "," so that
# values written with a thousands separator (e.g. "$1,200") are parsed instead
# of being coerced to NA and dropped by the NA filter that follows. This matches
# the "[$,]" pattern already used for the revenue columns.
df$price <- as.character(df$price)
df$price <- as.numeric(gsub("[$,[:space:]]", "", df$price))
## Warning: NAs introduced by coercion
# price is already numeric at this point, so rows that failed to parse are
# simply the NA ones.
df <- df[!is.na(df[["price"]]), ]

# Derived log(x + 1) and square-root columns, appended in this order.
df[["log_price"]] <- log(df[["price"]] + 1)
df[["log_review_scores_value"]] <- log(df[["review_scores_value"]] + 1)
df[["log_age_of_exp"]] <- log(df[["age_of_exp"]] + 1)
df[["sqrt_age_of_exp"]] <- sqrt(df[["age_of_exp"]])
df[["log_review_scores_rating"]] <- log(df[["review_scores_rating"]] + 1)
df[["log_accommodates"]] <- log(df[["accommodates"]] + 1)
df[["sqrt_accommodates"]] <- sqrt(df[["accommodates"]])
df[["log_host_response_rate"]] <- log(df[["host_response_rate"]] + 1)

# Strip "$" and whitespace from minimum_nights and convert it to numeric.
df[["minimum_nights"]] <- as.numeric(
  gsub("[$[:space:]]", "", df[["minimum_nights"]])
)

Final Data Cleaning (Removing all columns unused in analysis)

# Columns that are not used in the analysis below.
# "review_scores_cleanliness" was previously misspelled ("cleaniness"), so the
# column was never removed and na.omit() also discarded rows that were only
# missing that unused score.
unused_columns <- c(
  "neighbourhood", "city", "state",
  "bathrooms", "bedrooms", "beds", "bed_type",
  "guests_included", "extra_people", "minimum_nights",
  "review_scores_accuracy", "review_scores_rating",
  "review_scores_cleanliness", "review_scores_checkin",
  "review_scores_communication", "review_scores_location"
)
# Keep every column that is not listed above (names that are absent are ignored,
# as with the original `df$col <- NULL` assignments).
df <- df[setdiff(names(df), unused_columns)]

# Drop any row that still has a missing value in one of the remaining columns.
df <- na.omit(df)
head(df)
## # A tibble: 6 × 21
##   age_of_exp property_type room_type       accommodates price min_cost_per_night
##        <dbl> <chr>         <chr>                  <dbl> <dbl> <chr>             
## 1       8.93 Apartment     Entire home/apt            4   130 $520              
## 2       8.8  Apartment     Private room               2    59 $207              
## 3       8.74 Apartment     Entire home/apt            4    95 $285              
## 4       8.62 Apartment     Entire home/apt            2   100 $220              
## 5       8.57 Apartment     Entire home/apt            6   250 $500              
## 6       8.57 Apartment     Private room               2   115 $115              
## # ℹ 15 more variables: host_response_time <chr>, host_response_rate <dbl>,
## #   review_scores_cleanliness <dbl>, review_scores_value <dbl>,
## #   total_revenue_clean <dbl>, log_total_revenue <dbl>,
## #   rev_per_2_guest_clean <dbl>, log_price <dbl>,
## #   log_review_scores_value <dbl>, log_age_of_exp <dbl>, sqrt_age_of_exp <dbl>,
## #   log_review_scores_rating <dbl>, log_accommodates <dbl>,
## #   sqrt_accommodates <dbl>, log_host_response_rate <dbl>

Response Variable Analysis

# Raw total revenue is not normally distributed, so it is likely a poor
# response for linear regression unless it is transformed.
hist(
  df$total_revenue_clean / 1000,
  breaks = 40,
  main = "Histogram of the distribution of Total Revenue",
  xlab = "Total Revenue ($1000)"
)

# Log total revenue looks closer to normal and is a better fit for linear
# regression, so the analysis continues with log total revenue.
hist(
  df$log_total_revenue,
  breaks = 20,
  main = "Histogram of the distribution of Log Total Revenue"
)

# The boxplot helps show that the log-scale distribution is less skewed.
boxplot(
  df$log_total_revenue,
  main = "Boxplot of the distribution of Log Total Revenue",
  ylab = "Log Total Revenue"
)

Continuous Predictor Variables Analysis

Categorical Predictor Variables Analysis

Checking Property Type

# Keep only property types that occur at least 10 times, then compare log
# revenue across them. select() is namespace-qualified because MASS, attached
# later in this document, masks dplyr::select.
df_property_type <- df %>%
  add_count(property_type) %>%
  filter(n >= 10) %>%
  dplyr::select(-n)
boxplot(log_total_revenue ~ property_type, data = df_property_type,
        main = "Log Revenue by Property Type",
        ylab = "Log Total Revenue", las = 1, cex.axis = 0.7)
# Not very suitable, by observation

Check whether room_type is a good predictor to add to our model

# Distribution of log total revenue for each room type.
boxplot(log_total_revenue ~ room_type, data = df,
        main = "Log Revenue by Room Type",
        ylab = "Log Total Revenue", xlab = "Room Type", las = 1, cex.axis = 0.7)
# We see a good distinguishing factor between all 3 room types, especially
# between shared room and the others

Check whether host_response_time is a good predictor to add to our model

# Distribution of log total revenue for each host response time category.
boxplot(log_total_revenue ~ host_response_time, data = df,
        main = "Log Revenue by host response time",
        ylab = "Log Total Revenue", las = 1, cex.axis = 0.7)
# We see a non-trivial relationship here, but it is marginal compared to
# room_type as a categorical predictor

From the box plots, we think room type is the best categorical predictor to use in our model

Fitting Linear Model

## 
## Call:
## lm(formula = log_total_revenue ~ log_age_of_exp + log_price + 
##     review_scores_value + sqrt_accommodates + host_response_rate + 
##     room_type, data = as.data.frame(df), y = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9443 -0.8645  0.0594  0.9042  4.0434 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -0.10429    0.29234  -0.357    0.721    
## log_age_of_exp         2.30491    0.07910  29.139  < 2e-16 ***
## log_price              0.48994    0.04546  10.776  < 2e-16 ***
## review_scores_value    0.10600    0.01919   5.524 3.46e-08 ***
## sqrt_accommodates      0.31886    0.04735   6.734 1.81e-11 ***
## host_response_rate     0.94364    0.11170   8.448  < 2e-16 ***
## room_typePrivate room -0.07659    0.04814  -1.591    0.112    
## room_typeShared room  -0.87323    0.21942  -3.980 6.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 5694 degrees of freedom
## Multiple R-squared:  0.2165, Adjusted R-squared:  0.2156 
## F-statistic: 224.8 on 7 and 5694 DF,  p-value: < 2.2e-16
# Scatter of fitted against actual log total revenue.
plot(
  log_model_with_cat$fitted.values, df$log_total_revenue,
  cex = 0.01,
  main = "Fitted Log Total Revenue vs Actual Log Total Revenue",
  xlab = "Fitted Log Total Revenue",
  ylab = "Actual Log Total Revenue"
)
# Reference line y = x (intercept 0, slope 1). It is drawn as its own call
# after plot(); previously abline() was passed as a positional argument to
# plot(), where it was only drawn as a side effect of argument evaluation.
abline(0, 1, col = "red")

Residual Analysis

Residual vs Fitted Plot: looking for residuals with zero mean and constant variance

Box-Cox Attempt

Attempt to optimize model with further Box-Cox transformation on top of log.

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(psych)
# boxcox() evaluates the profile log-likelihood over a grid of lambda values:
# b$x holds the lambdas and b$y the corresponding log-likelihoods.
b <- boxcox(log_model_with_cat)
# Pick the grid value of lambda with the highest log-likelihood.
lambda <- b$x[which.max(b$y)]
lambda
## [1] 1.151515
# Geometric mean of the (already log-transformed) response; used below to scale
# the Box-Cox transform. geometric.mean() is presumably psych's, attached above.
geom_mean <- geometric.mean(df$log_total_revenue)
# Scaled Box-Cox power transform of the response with the selected lambda:
#   geom_mean^(1 - lambda) * (y^lambda - 1) / lambda
log_total_revenue_transformed <- geom_mean ^ (1-lambda) * (df$log_total_revenue^lambda - 1) / lambda
# Store the transformed response in df so the model formula can refer to it.
df$log_total_revenue_transformed <- log_total_revenue_transformed
# Refit with the transformed response; y = TRUE keeps the response vector in
# the fitted model object.
box_cox_log_model_with_cat <- lm(
  log_total_revenue_transformed ~ log_age_of_exp + log_price + review_scores_value + sqrt_accommodates + host_response_rate + room_type, 
  data = df, y = TRUE
)
summary(box_cox_log_model_with_cat)
## 
## Call:
## lm(formula = log_total_revenue_transformed ~ log_age_of_exp + 
##     log_price + review_scores_value + sqrt_accommodates + host_response_rate + 
##     room_type, data = df, y = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8632 -0.8717  0.0444  0.8959  4.2136 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -1.80055    0.29219  -6.162 7.67e-10 ***
## log_age_of_exp         2.31110    0.07906  29.233  < 2e-16 ***
## log_price              0.48899    0.04544  10.761  < 2e-16 ***
## review_scores_value    0.10537    0.01918   5.494 4.10e-08 ***
## sqrt_accommodates      0.32137    0.04733   6.790 1.23e-11 ***
## host_response_rate     0.94504    0.11164   8.465  < 2e-16 ***
## room_typePrivate room -0.07089    0.04812  -1.473 0.140728    
## room_typeShared room  -0.85165    0.21931  -3.883 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 5694 degrees of freedom
## Multiple R-squared:  0.2168, Adjusted R-squared:  0.2159 
## F-statistic: 225.2 on 7 and 5694 DF,  p-value: < 2.2e-16
# Residuals vs fitted values for the Box-Cox model: minimal improvement observed.
plot(box_cox_log_model_with_cat, which = 1, cex = 0.01)

# Normal Q-Q plot of the residuals: minimal improvement observed.
plot(box_cox_log_model_with_cat, which = 2, cex = 0.7)

We see that the Box-Cox transformation does not noticeably improve our residuals, and it also reduces the interpretability of our model by adding complexity to our already log-transformed response.